1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

About Dataset Context The principal diagnosis method for SARS-CoV-2 is a PCR technique, which allows for the detection of a genetic material of a pathogen or microorganism and it has high specificity, sensitivity and helps diagnosing even in the first stages of infection. However, it is not the fastest method to use in this situation and it’s very time consuming.

Raman spectroscopy could be used as a cheap and quick method to diagnose infection by SARS-CoV-2.

Content Inside this dataset there’re three files, each containing the raw spectra of healthy and infected subjects, unknown-state subjects and the probe used with saline solution inside. The data was in MATLAB’s proprietary format and was transformed to CSV using Python’s Pandas library.

Source The source of this data was found in the following DOI: https://doi.org/10.6084/m9.figshare.12159924.v1

Acknowledgements If used cite: Yin, Gang; Li, Lintao; Lu, Shun; Yin, Yu; Su, Yuanzhang; Zeng, Yilan; et al. (2020): Data and code on serum Raman spectroscopy as an efficient primary screening of coronavirus disease in 2019 (COVID-19). figshare. Dataset. https://doi.org/10.6084/m9.figshare.12159924.v1

1.2 Data: The COVID_19 Data-Set

The data to process is described in:

https://www.kaggle.com/datasets/sfran96/raman-spectroscopy-for-detecting-covid19?select=covid_and_healthy_spectra.csv

I added a column to the data identifying the repeated experiments.


SalivaIR <- read.csv("~/GitHub/LatentBiomarkers/Data/RAMANCOVID19/covid_and_healthy_spectra.csv")
SalivaIR$X400 <- NULL
SalivaIR$RepID <- rep(c(1:3),103) 
SalivaIR_set1 <- subset(SalivaIR,RepID==1)
SalivaIR_set1$RepID <- NULL
diagnos <- SalivaIR_set1$diagnostic
SalivaIR_set1$diagnostic <- NULL

SalivaIR_set2 <- subset(SalivaIR,RepID==2)
SalivaIR_set2$RepID <- NULL
SalivaIR_set2$diagnostic <- NULL

SalivaIR_set3 <- subset(SalivaIR,RepID==3)
SalivaIR_set3$RepID <- NULL
SalivaIR_set3$diagnostic <- NULL

SalivaIR_Avg <- (SalivaIR_set1 + SalivaIR_set2 + SalivaIR_set3)/3

SalivaIR_Avg$class <- 1*(diagnos=="SARS-CoV-2")

pander::pander(table(SalivaIR_Avg$class))
0 1
50 53

1.2.0.1 Standarize the names for the reporting

studyName <- "IRSaliva_2"
dataframe <- SalivaIR_Avg
outcome <- "class"

TopVariables <- 10

thro <- 0.80
cexheat = 0.15

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")
library("rpart")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
103 899
pander::pander(table(dataframe[,outcome]))
0 1
50 53

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500 

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  iscontinous <- sapply(apply(dataframe,2,unique),length) >= 5 ## Only variables with enough samples



dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data

numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000


if (!largeSet)
{

  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9991348

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  X1453 
#>      X402      X405      X407      X410      X412      X415 
#> 0.0741573 0.5325843 0.7382022 0.7337079 0.6910112 0.7101124 
#> 
#>  Included: 890 , Uni p: 0.0001685393 , Base Size: 1 , Rcrit: 0.3464623 
#> 
#> 
 1 <R=0.999,thr=0.950>, Top: 78< 44 >[Fa= 78 ]( 78 , 390 , 0 ),<|><>Tot Used: 468 , Added: 390 , Zero Std: 0 , Max Cor: 0.999
#> 
 2 <R=0.999,thr=0.950>, Top: 63< 7 >[Fa= 141 ]( 63 , 127 , 78 ),<|><>Tot Used: 609 , Added: 127 , Zero Std: 0 , Max Cor: 0.996
#> 
 3 <R=0.996,thr=0.950>, Top: 23< 3 >[Fa= 164 ]( 23 , 28 , 141 ),<|><>Tot Used: 654 , Added: 28 , Zero Std: 0 , Max Cor: 0.982
#> 
 4 <R=0.982,thr=0.950>, Top: 4< 2 >[Fa= 168 ]( 4 , 5 , 164 ),<|><>Tot Used: 663 , Added: 5 , Zero Std: 0 , Max Cor: 0.980
#> 
 5 <R=0.980,thr=0.950>, Top: 1< 1 >[Fa= 169 ]( 1 , 1 , 168 ),<|><>Tot Used: 665 , Added: 1 , Zero Std: 0 , Max Cor: 0.950
#> 
 6 <R=0.950,thr=0.900>, Top: 99< 10 >[Fa= 213 ]( 93 , 191 , 169 ),<|><>Tot Used: 754 , Added: 191 , Zero Std: 0 , Max Cor: 0.990
#> 
 7 <R=0.990,thr=0.950>, Top: 4< 1 >[Fa= 214 ]( 4 , 4 , 213 ),<|><>Tot Used: 754 , Added: 4 , Zero Std: 0 , Max Cor: 0.949
#> 
 8 <R=0.949,thr=0.900>, Top: 32< 1 >[Fa= 233 ]( 32 , 43 , 214 ),<|><>Tot Used: 787 , Added: 43 , Zero Std: 0 , Max Cor: 0.955
#> 
 9 <R=0.955,thr=0.950>, Top: 1< 1 >[Fa= 233 ]( 1 , 1 , 233 ),<|><>Tot Used: 788 , Added: 1 , Zero Std: 0 , Max Cor: 0.949
#> 
 10 <R=0.949,thr=0.900>, Top: 7< 1 >[Fa= 237 ]( 7 , 7 , 233 ),<|><>Tot Used: 797 , Added: 7 , Zero Std: 0 , Max Cor: 0.900
#> 
 11 <R=0.900,thr=0.800>, Top: 83< 2 >[Fa= 267 ]( 79 , 167 , 237 ),<|><>Tot Used: 833 , Added: 167 , Zero Std: 0 , Max Cor: 0.924
#> 
 12 <R=0.924,thr=0.900>, Top: 2< 1 >[Fa= 267 ]( 2 , 2 , 267 ),<|><>Tot Used: 834 , Added: 2 , Zero Std: 0 , Max Cor: 0.899
#> 
 13 <R=0.899,thr=0.800>, Top: 33< 1 >[Fa= 282 ]( 31 , 56 , 267 ),<|><>Tot Used: 860 , Added: 56 , Zero Std: 0 , Max Cor: 0.892
#> 
 14 <R=0.892,thr=0.800>, Top: 5< 1 >[Fa= 285 ]( 5 , 5 , 282 ),<|><>Tot Used: 866 , Added: 5 , Zero Std: 0 , Max Cor: 0.820
#> 
 15 <R=0.820,thr=0.800>, Top: 1< 1 >[Fa= 286 ]( 1 , 1 , 285 ),<|><>Tot Used: 866 , Added: 1 , Zero Std: 0 , Max Cor: 0.800
#> 
 16 <R=0.800,thr=0.800>
#> 
 [ 16 ], 0.799788 Decor Dimension: 866 Nused: 866 . Cor to Base: 328 , ABase: 890 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

0.0708

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

0.00393

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

4.18

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

2.97


varratio <- attr(DEdataframe,"VarRatio")

pander::pander(tail(varratio))
La_X1468 La_X1455 La_X1450 La_X1463 La_X1451 La_X1461
0.000696 0.000671 0.000482 0.000462 0.000434 0.000428

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPLTM <- attr(DEdataframe,"UPLTM")
  
  gplots::heatmap.2(1.0*(abs(UPLTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
  
  
  
}

1.5.2 Formulas Network

Displaying the features associations

par(op)


#if ((ncol(dataframe) < 1000) && (ncol(dataframe) > 10))
#{

#  DEdataframeB <- ILAA(dataframe,verbose=TRUE,thr=thro,bootstrap=30)

  transform <- attr(DEdataframe,"UPLTM") != 0
  tnames <- colnames(transform)
  colnames(transform) <- str_remove_all(colnames(transform),"La_")
  transform <- abs(transform*cor(dataframe[,rownames(transform)])) # The weights are proportional to the observed correlation
  
  
  VertexSize <- attr(DEdataframe,"fscore") # The size depends on the variable independence relevance (fscore)
  names(VertexSize) <- str_remove_all(names(VertexSize),"La_")
  VertexSize <- 10*(VertexSize-min(VertexSize))/(max(VertexSize)-min(VertexSize)) # Normalization

  VertexSize <- VertexSize[rownames(transform)]
  rsum <- apply(1*(transform !=0),1,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
  csum <- apply(1*(transform !=0),2,sum) + 0.01*VertexSize + 0.001*varratio[tnames]
  
  ntop <- min(10,length(rsum))


  topfeatures <- unique(c(names(rsum[order(-rsum)])[1:ntop],names(csum[order(-csum)])[1:ntop]))
  rtrans <- transform[topfeatures,]
  csum <- (apply(1*(rtrans !=0),2,sum) > 1)
  rtrans <- rtrans[,csum]
  topfeatures <- unique(c(topfeatures,colnames(rtrans)))
  print(ncol(transform))
#> [1] 866
  transform <- transform[topfeatures,topfeatures]
  print(ncol(transform))
#> [1] 22
  if (ncol(transform)>100)
  {
    csum <- (apply(1*(transform !=0),2,sum) > 1) & (apply(1*(transform !=0),1,sum) > 1)
    transform <- transform[csum,csum]
    print(ncol(transform))
  }

    if (ncol(transform) < 150)
    {

      gplots::heatmap.2(transform,
                        trace = "none",
                        mar = c(5,5),
                        col=rev(heat.colors(5)),
                        main = "Red Decorrelation matrix",
                        cexRow = cexheat,
                        cexCol = cexheat,
                       srtCol=45,
                       srtRow=45,
                        key.title=NA,
                        key.xlab="|Beta|>0",
                        xlab="Output Feature", ylab="Input Feature")
  
      par(op)
      VertexSize <- VertexSize[colnames(transform)]
      gr <- graph_from_adjacency_matrix(transform,mode = "directed",diag = FALSE,weighted=TRUE)
      gr$layout <- layout_with_fr
      
      fc <- cluster_optimal(gr)
      plot(fc, gr,
           edge.width = 2*E(gr)$weight,
           vertex.size=VertexSize,
           edge.arrow.size=0.5,
           edge.arrow.width=0.5,
           vertex.label.cex=(0.15+0.05*VertexSize),
           vertex.label.dist=0.5 + 0.05*VertexSize,
           main="Top Feature Association")
      
    }


par(op)

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after ILAA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.799788

1.8 U-MAP Visualization of features

1.8.1 The UMAP on Raw Data


  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  topvars <- univariate_BinEnsemble(dataframe,outcome)
  lso <- LASSO_MIN(formula(paste(outcome,"~.")),dataframe,family="binomial")
  topvars <- unique(c(names(topvars),lso$selectedfeatures))
  pander::pander(head(topvars))

X1255, X1757, X1761, X1762, X1780 and X1782

#  names(topvars)
#if (nrow(dataframe) < 1000)
#{
  datasetframe.umap = umap(scale(dataframe[1:numsub,topvars]),n_components=2)
#  datasetframe.umap = umap(dataframe[1:numsub,varlist],n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])

#}

1.8.2 The decorralted UMAP

  varlistcV <- names(varratio[varratio >= 0.025])
  topvars <- univariate_BinEnsemble(DEdataframe[,varlistcV],outcome)
  lso <- LASSO_MIN(formula(paste(outcome,"~.")),DEdataframe,family="binomial")
  topvars <- unique(c(names(topvars),lso$selectedfeatures))
  pander::pander(head(topvars))

X1255, X2082, X1942, X2056, La_X1201 and La_X1353


  varlistcV <- varlistcV[varlistcV != outcome]
  
#  DEdataframe[,outcome] <- as.numeric(DEdataframe[,outcome])
#if (nrow(dataframe) < 1000)
#{
  datasetframe.umap = umap(scale(DEdataframe[1:numsub,topvars]),n_components=2)
#  datasetframe.umap = umap(DEdataframe[1:numsub,varlistcV],n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After ILAA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])

#}

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")

100 : X641 200 : X869 300 : X1088 400 : X1293 500 : X1486
600 : X1667 700 : X1833 800 : X1985




univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

100 : La_X641 200 : La_X869 300 : X1088 400 : La_X1293 500 : X1486
600 : La_X1667 700 : La_X1833 800 : La_X1985

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##top variables
topvar <- c(1:length(varlist)) <= TopVariables
tableRaw <- univarRAW$orderframe[topvar,univariate_columns]
pander::pander(tableRaw)
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
X2078 -0.00831 0.000859 -0.00645 0.000455 0.990 0.944
X2045 -0.01737 0.000858 -0.01574 0.000713 0.388 0.937
X2080 -0.00870 0.000957 -0.00671 0.000464 0.985 0.936
X2086 -0.01137 0.001083 -0.00908 0.000517 0.783 0.935
X2046 -0.01814 0.000941 -0.01636 0.000604 0.882 0.935
X2077 -0.00826 0.000843 -0.00674 0.000458 0.959 0.934
X2085 -0.01060 0.001052 -0.00825 0.000465 0.799 0.934
X2088 -0.01167 0.000953 -0.00970 0.000500 0.898 0.932
X2048 -0.01880 0.000968 -0.01698 0.000660 0.672 0.928
X2049 -0.01907 0.001017 -0.01718 0.000782 0.868 0.926


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]


pander::pander(finalTable)
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
La_X1575 -0.008555 0.001137 -0.01036 0.000625 0.973 0.922
X2082 -0.009254 0.001368 -0.00729 0.000440 0.909 0.918
La_X415 0.000796 0.001516 -0.00160 0.000915 0.338 0.907
La_X969 0.001040 0.002240 0.00537 0.002658 0.616 0.907
La_X598 -0.001178 0.000705 -0.00241 0.000643 0.219 0.904
La_X407 -0.008368 0.002743 -0.01202 0.001372 0.395 0.902
La_X1279 0.011742 0.000975 0.01012 0.000804 1.000 0.902
La_X1027 0.007151 0.000999 0.00552 0.000784 0.707 0.898
La_X1201 -0.006897 0.000999 -0.00870 0.001005 0.769 0.898
X2056 -0.021217 0.001332 -0.01900 0.001003 0.709 0.896

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")


pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
2.29 816 0.917

theCharformulas <- attr(dc,"LatentCharFormulas")


finalTable <- rbind(finalTable,tableRaw[topvar[!(topvar %in% topLAvar)],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- theCharformulas[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
X2078 NA -0.008314 0.000859 -0.00645 0.000455 0.990 0.944 0.944 NA
X2045 NA -0.017370 0.000858 -0.01574 0.000713 0.388 0.937 0.937 NA
X2080 NA -0.008703 0.000957 -0.00671 0.000464 0.985 0.936 0.936 NA
X2086 NA -0.011371 0.001083 -0.00908 0.000517 0.783 0.935 0.935 NA
X2046 NA -0.018144 0.000941 -0.01636 0.000604 0.882 0.935 0.935 NA
X2077 NA -0.008257 0.000843 -0.00674 0.000458 0.959 0.934 0.934 NA
X2085 NA -0.010600 0.001052 -0.00825 0.000465 0.799 0.934 0.934 NA
X2088 NA -0.011674 0.000953 -0.00970 0.000500 0.898 0.932 0.932 NA
X2048 NA -0.018797 0.000968 -0.01698 0.000660 0.672 0.928 0.928 NA
X2049 NA -0.019069 0.001017 -0.01718 0.000782 0.868 0.926 0.926 NA
La_X1575 + X1575 - (0.979)X1584 -0.008555 0.001137 -0.01036 0.000625 0.973 0.922 0.510 -1
X2082 NA -0.009254 0.001368 -0.00729 0.000440 0.909 0.918 0.918 0
La_X415 + X415 - (1.502)X417 + (0.492)X420 0.000796 0.001516 -0.00160 0.000915 0.338 0.907 0.731 -2
La_X969 - (1.406)X967 + X969 0.001040 0.002240 0.00537 0.002658 0.616 0.907 0.595 -1
La_X598 + X598 - (0.997)X601 -0.001178 0.000705 -0.00241 0.000643 0.219 0.904 0.574 -1
La_X407 + X407 - (0.805)X420 -0.008368 0.002743 -0.01202 0.001372 0.395 0.902 0.763 1
La_X1279 + X1279 - (1.078)X1283 0.011742 0.000975 0.01012 0.000804 1.000 0.902 0.561 0
La_X1027 + X1027 - (0.716)X1029 0.007151 0.000999 0.00552 0.000784 0.707 0.898 0.674 -1
La_X1201 - (0.536)X1187 + X1201 -0.006897 0.000999 -0.00870 0.001005 0.769 0.898 0.710 1
X2056 NA -0.021217 0.001332 -0.01900 0.001003 0.709 0.896 0.896 7

1.10 Comparing ILAA vs PCA vs EFA

1.10.1 PCA

featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,scale. = TRUE)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

1.10.2 EFA


EFAdataframe <- dataframeScaled

if (length(iscontinous) < 2000)
{
  topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

1.11 Effect on CAR modeling

par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }


pander::pander(table(dataframe[,outcome],pr))
  0 1
0 50 0
1 6 47
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.942 0.878 0.978
3 se 0.887 0.770 0.957
4 sp 1.000 0.929 1.000
6 diag.or Inf NA Inf

par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe[,c(outcome,varlistcV)],control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="ILAA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 50 0
1 6 47
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.942 0.878 0.978
3 se 0.887 0.770 0.957
4 sp 1.000 0.929 1.000
6 diag.or Inf NA Inf

par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 44 6
1 4 49
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.903 0.829 0.952
3 se 0.925 0.818 0.979
4 sp 0.880 0.757 0.955
6 diag.or 89.833 23.782 339.333


par(op)

1.11.1 EFA


  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }


  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 38 12
1 3 50
  pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.854 0.771 0.916
3 se 0.943 0.843 0.988
4 sp 0.760 0.618 0.869
6 diag.or 52.778 13.908 200.278
  par(op)